import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Import data
# Load the dataset and keep only rows that have both variables needed for the
# CO2-vs-GDP analysis. NOTE: `df` is reused by every later section of this
# file, so rows removed here are also absent from those analyses.
REQUIRED_COLUMNS = ["CO2 emissions (metric tons per capita)", "gdpPercap"]
df = pd.read_csv("gapminder_clean.csv")
df = df.dropna(subset=REQUIRED_COLUMNS)
# Scatter plot
# CO2 emissions against GDP per capita, restricted to the rows for 1962.
is_1962 = df["Year"] == 1962
filtered_data = df.loc[is_1962]
sns.scatterplot(
    x="CO2 emissions (metric tons per capita)",
    y="gdpPercap",
    data=filtered_data,
)
<AxesSubplot:xlabel='CO2 emissions (metric tons per capita)', ylabel='gdpPercap'>
# Pearson's r
# Linear correlation between CO2 emissions and GDP per capita in the 1962 rows.
from scipy.stats import pearsonr

co2_1962 = filtered_data["CO2 emissions (metric tons per capita)"]
gdp_1962 = filtered_data["gdpPercap"]
corr, p_value = pearsonr(co2_1962, gdp_1962)
print(
    "\n Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap, year 1962: \n",
    "Correlation value: ",
    corr,
    "p-value: ",
    p_value,
)
Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap, year 1962: Correlation value: 0.9260816725019472 p-value: 1.1286792210038754e-46
# Find the year (other than 1962, handled above) in which CO2 emissions and
# GDP per capita are most strongly correlated.
unfiltered_data = df[df["Year"] != 1962]
# Correlate only the two columns of interest within each year. The previous
# DataFrameGroupBy.corrwith call correlated *every* column with CO2, which
# raises on the string columns (continent, Country Name) under pandas >= 2.0,
# where numeric_only defaults to False.
corr_by_year = (
    unfiltered_data.groupby("Year")[["CO2 emissions (metric tons per capita)", "gdpPercap"]]
    .apply(
        lambda year_rows: year_rows["CO2 emissions (metric tons per capita)"].corr(
            year_rows["gdpPercap"]
        )
    )
    # Keep the series name the original expression produced.
    .rename("gdpPercap")
)
# Highest Pearson correlation first; show the single best year.
corr_by_year.sort_values(ascending=False).head(1)
Year 1967 0.938792 Name: gdpPercap, dtype: float64
# Interactive scatter for 1967, the year with the strongest correlation found
# above. Marker colour encodes continent and marker size encodes population.
new_filtered_data = df[df["Year"] == 1967]
import plotly.express as px
fig = px.scatter(
    new_filtered_data,
    x="CO2 emissions (metric tons per capita)",
    y="gdpPercap",
    color="continent",
    size="pop",
    hover_data=["Country Name"],
    # The title previously contained a stray, unbalanced apostrophe.
    title="CO2 emissions (metric tons per capita) and gdpPercap. Year 1967",
)
fig.show()
What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'?
# Summary statistics of per-capita energy use within each continent.
energy_grouped = df.groupby("continent")["Energy use (kg of oil equivalent per capita)"]
energy_grouped.describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| continent | ||||||||
| Africa | 198.0 | 700.642721 | 628.227685 | 9.715410 | 377.734680 | 451.382174 | 746.247275 | 3071.774832 |
| Americas | 188.0 | 1703.620453 | 2377.181918 | 219.075497 | 556.033108 | 749.029108 | 1384.585146 | 14608.009868 |
| Asia | 185.0 | 1867.280336 | 2590.043514 | 86.903767 | 345.370792 | 760.140852 | 1987.087308 | 12122.050603 |
| Europe | 239.0 | 3110.604287 | 1768.370162 | 350.101258 | 2045.782889 | 2954.266739 | 3853.373983 | 14746.031338 |
| Oceania | 20.0 | 3980.314420 | 1123.410756 | 1791.461322 | 3143.501420 | 4044.850674 | 4783.650230 | 5868.347097 |
# Horizontal box plot: one box of per-capita energy use per continent.
fig = px.box(
    data_frame=df,
    y="continent",
    x="Energy use (kg of oil equivalent per capita)",
)
fig.show()
I need to know if the data satisfy parametric requirements to use parametric tests.
ENERGY_COLUMN = "Energy use (kg of oil equivalent per capita)"


def _energy_use_for(continent_name):
    """Return the non-missing energy-use values of one continent's rows in ``df``."""
    continent_rows = df["continent"] == continent_name
    return df.loc[continent_rows, ENERGY_COLUMN].dropna()


# One sample per continent, used by the normality / variance / rank tests.
americas_energy = _energy_use_for("Americas")
oceania_energy = _energy_use_for("Oceania")
africa_energy = _energy_use_for("Africa")
europe_energy = _energy_use_for("Europe")
asia_energy = _energy_use_for("Asia")
import scipy.stats as stats
# The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.
continent_samples = (americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy)
shapiro_results = [stats.shapiro(sample) for sample in continent_samples]
# Printed in the order Americas, Oceania, Africa, Europe, Asia; the separators
# reproduce the spacing of a multi-argument print with "\n" between results.
print("\n " + " \n ".join(str(result) for result in shapiro_results))
ShapiroResult(statistic=0.5632225871086121, pvalue=1.5868403861741054e-21) ShapiroResult(statistic=0.9818098545074463, pvalue=0.9552662372589111) ShapiroResult(statistic=0.6753993034362793, pvalue=2.724574480963581e-19) ShapiroResult(statistic=0.889901876449585, pvalue=3.4700315537650184e-12) ShapiroResult(statistic=0.6609910130500793, pvalue=4.943039538112358e-19)
Shapiro-Wilk tests: rejected 4/5 null hypotheses (only Oceania is consistent with a normal distribution). The data are not normally distributed.
import scipy.stats as stats
# The Levene test tests the null hypothesis that all input samples are from populations with equal variances.
# NOTE: per the SciPy documentation, proportiontocut is only used when
# center='trimmed'; it is passed through unchanged here.
stats.levene(
    americas_energy,
    oceania_energy,
    africa_energy,
    europe_energy,
    asia_energy,
    center="median",
    proportiontocut=0.05,
)
LeveneResult(statistic=12.113489871462216, pvalue=1.4250677574810339e-09)
Levene test: rejected the null hypothesis of equal variances.
# The Kruskal-Wallis H-test tests the null hypothesis that the population median of all of the groups are equal.
energy_groups = (americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy)
# NOTE: despite its name, `fvalue` receives the first element of the kruskal
# result, i.e. the H statistic.
fvalue, pvalue = stats.kruskal(*energy_groups)
print(fvalue, pvalue)
302.0114932359461 3.989307514095183e-64
Kruskal-Wallis H-test: rejected null hypothesis.
# Post hoc pairwise test for multiple comparisons of mean rank sums (Dunn's test).
# May be used after Kruskal-Wallis one-way analysis of variance by ranks to do pairwise comparisons
import scikit_posthocs as sp
# Labels follow the order in which the samples are passed to posthoc_dunn.
group_labels = ["americas_energy", "oceania_energy", "africa_energy", "europe_energy", "asia_energy"]
# NOTE(review): no p_adjust argument is given, so the p-values are presumably
# not corrected for the ten pairwise comparisons - confirm this is intended.
dunn_test = sp.posthoc_dunn(
    [americas_energy, oceania_energy, africa_energy, europe_energy, asia_energy]
)
dunn_test.index = group_labels
dunn_test.columns = group_labels
dunn_test
| americas_energy | oceania_energy | africa_energy | europe_energy | asia_energy | |
|---|---|---|---|---|---|
| americas_energy | 1.000000e+00 | 5.358423e-08 | 6.438289e-10 | 1.025511e-20 | 1.858753e-01 |
| oceania_energy | 5.358423e-08 | 1.000000e+00 | 4.148905e-16 | 1.125350e-01 | 1.779352e-09 |
| africa_energy | 6.438289e-10 | 4.148905e-16 | 1.000000e+00 | 9.844921e-58 | 1.479379e-06 |
| europe_energy | 1.025511e-20 | 1.125350e-01 | 9.844921e-58 | 1.000000e+00 | 1.130215e-26 |
| asia_energy | 1.858753e-01 | 1.779352e-09 | 1.479379e-06 | 1.130215e-26 | 1.000000e+00 |
# Heatmap of the pairwise p-value matrix returned by Dunn's test.
fig = px.imshow(img=dunn_test)
fig.show()
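Dunn's test finds no significant difference in energy use between Asia and the Americas (p ≈ 0.19), nor between Oceania and Europe (p ≈ 0.11), so energy use in each of these pairs is similar. All other pairs of continents differ significantly.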
Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (Stats test needed)
# Rows for Europe or Asia, after 1990, with imports below the outlier cutoff.
imports_column = 'Imports of goods and services (% of GDP)'
in_europe_or_asia = df["continent"].isin(["Europe", "Asia"])
after_1990 = df["Year"] > 1990
# Deleted Outliers (Singapore is an exception)
below_outlier_cutoff = df[imports_column] < 97
europe_and_asia_after_1990 = df[in_europe_or_asia & after_1990 & below_outlier_cutoff]
europe_and_asia_after_1990.groupby("continent")[imports_column].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| continent | ||||||||
| Asia | 93.0 | 41.713928 | 23.454248 | 0.079506 | 25.393531 | 38.831290 | 58.350047 | 96.742045 |
| Europe | 111.0 | 41.761071 | 16.818978 | 17.345130 | 28.462519 | 37.691245 | 51.129504 | 88.512248 |
# Box plot of imports (% of GDP) per continent; hovering shows country and year.
fig = px.box(
    data_frame=europe_and_asia_after_1990,
    y="continent",
    x="Imports of goods and services (% of GDP)",
    hover_data=["Country Name", "Year"],
)
fig.show()
# Split the filtered rows into one imports sample per continent.
imports_series = europe_and_asia_after_1990["Imports of goods and services (% of GDP)"]
continent_of_row = europe_and_asia_after_1990["continent"]
europe_imports = imports_series[continent_of_row == "Europe"].dropna()
asia_imports = imports_series[continent_of_row == "Asia"].dropna()
# Normality of each sample (Shapiro-Wilk), then equality of variances (Levene).
europe_shapiro = stats.shapiro(europe_imports)
asia_shapiro = stats.shapiro(asia_imports)
print("\n", europe_shapiro, "\n", asia_shapiro)
stats.levene(europe_imports, asia_imports, center="median", proportiontocut=0.05)
ShapiroResult(statistic=0.9278804063796997, pvalue=1.486705423303647e-05) ShapiroResult(statistic=0.9704142212867737, pvalue=0.032892148941755295)
LeveneResult(statistic=9.881593643140368, pvalue=0.0019207769451392669)
Shapiro-Wilk tests: rejected 2/2 null hypotheses at α = 0.05 (only 1/2 at α = 0.01, since Asia's p ≈ 0.033). The data are not normally distributed. Levene test: rejected the null hypothesis of equal variances.
# The Mann-Whitney U test is used to compare differences between two independent groups when the dependent variable is either ordinal or continuous, but not normally distributed.
# The question asks whether Europe and Asia differ in either direction, so the
# two-sided alternative is requested explicitly. Without it the result depends
# on the installed SciPy: older releases defaulted to a deprecated legacy mode
# that reports a p-value half the size of the two-sided one.
stats.mannwhitneyu(x=europe_imports, y=asia_imports, alternative="two-sided")
MannwhitneyuResult(statistic=5034.0, pvalue=0.3811649093378452)
Cannot reject the null hypothesis that imports (% of GDP) follow the same distribution in Europe and Asia, i.e. no significant difference was found.
What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
# One line per country (coloured by continent) of population density over time.
# NOTE(review): this only visualises the series; the average per-year rank
# asked for in the question is not computed here.
density_column = "Population density (people per sq. km of land area)"
fig = px.line(
    data_frame=df,
    x="Year",
    y=density_column,
    line_group="Country Name",
    color="continent",
    hover_name="Country Name",
    title="Population density (people per sq. km of land area) across all years",
)
fig.show()
What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?
# For each country, compare its earliest and latest recorded life expectancy.
life_column = "Life expectancy at birth, total (years)"
# Sort by Year explicitly so that "first"/"last" mean earliest/latest record
# instead of depending on the row order of the CSV. GroupBy first/last skip
# missing values, so each country gets its earliest/latest non-null value.
# A single named aggregation replaces the earlier concat of two frames, which
# needed a positional "drop" placeholder column and a rename whose result was
# discarded.
df_3 = (
    df.sort_values("Year")
    .groupby("Country Name")[life_column]
    .agg(first_life="first", last_life="last")
    .reset_index()
)
# Change in life expectancy between the first and last record, in years.
df_3["delta"] = df_3["last_life"] - df_3["first_life"]
# Five countries with the largest increase.
df_3.sort_values(by=["delta"], ascending=False).head(5)[["Country Name", "delta"]]
| Country Name | delta | |
|---|---|---|
| 118 | Tunisia | 30.860756 |
| 81 | Nepal | 30.599634 |
| 24 | China | 29.942098 |
| 88 | Oman | 27.016537 |
| 99 | Saudi Arabia | 26.650561 |
# Keep only countries whose life expectancy increased; delta is also used as
# the marker size below (presumably why non-positive values are excluded).
has_positive_delta = df_3["delta"] > 0
df_3_greater_than_zero = df_3[has_positive_delta]
axis_labels = {
    "last_life": "Life expectancy at birth, total (years). Last record.",
    "delta": "Difference between first and last record, (years)",
}
fig = px.scatter(
    df_3_greater_than_zero,
    x="delta",
    y="last_life",
    color="delta",
    size="delta",
    hover_data=["Country Name"],
    labels=axis_labels,
    title="Changes in life expectancy at birth",
)
fig.show()